Riemannian metric from manifold learning

library(tidyverse)
library(dimRed)
# library(dtplyr)
library(reticulate)
library(viridis)
# library(ks)
library(hdrcde)
library(igraph)
# library(seurat)
library(matrixcalc)
library(akima)
Jmisc::sourceAll(here::here("R/sources"))
## Loading...
##   /Users/fche0019/git/kderm/R/sources/add_global_label.R 
##   /Users/fche0019/git/kderm/R/sources/annHLLE.R 
##   /Users/fche0019/git/kderm/R/sources/annIsomap.R 
##   /Users/fche0019/git/kderm/R/sources/annLaplacianEigenmaps.R 
##   /Users/fche0019/git/kderm/R/sources/annLLE.R 
##   /Users/fche0019/git/kderm/R/sources/cal_recall.R 
##   /Users/fche0019/git/kderm/R/sources/calc_ann_table.R 
##   /Users/fche0019/git/kderm/R/sources/dimRedResult-class.R 
##   /Users/fche0019/git/kderm/R/sources/dr_quality.R 
##   /Users/fche0019/git/kderm/R/sources/hdrscatterplot.R 
##   /Users/fche0019/git/kderm/R/sources/makeKNNgraph.R 
##   /Users/fche0019/git/kderm/R/sources/metricML.R 
##   /Users/fche0019/git/kderm/R/sources/mnist_anntable_plotting.R 
## Done

Smart meter data for one household

# load(here::here("data/spdemand_3639id336tow.rda"))
# nid <- 1
# ntow <- 336
# 
# train <- spdemand %>%
#   lazy_dt() %>%
#   filter(tow <= ntow,
#          # id <= sort(unique(spdemand[,id]))[nid],
#          id == 1003) %>%
#   dplyr::select(-id, -tow) %>%
#   data.table::as.data.table()
train <- readRDS(here::here("data/spdemand_1id336tow_train.rds"))
(N <- nrow(train))
## [1] 336

Metric learning

# Parameters fixed
x <- train
s <- 2
k <- 20
method <- "annIsomap"
annmethod <- "kdtree"
distance <- "euclidean"
treetype <- "kd"
searchtype <- "radius" # change searchtype for radius search based on `radius`, or KNN search based on `k`
radius <- .4

Radius searching with k-d trees

metric_isomap <- metricML(x, s = s, k = k, radius = radius, method = method, annmethod = annmethod, eps = 0, distance = distance, treetype = treetype, searchtype = searchtype)
summary(metric_isomap)
##                Length Class     Mode   
## embedding         672 -none-    numeric
## rmetric          1344 -none-    numeric
## weighted_graph     10 igraph    list   
## adj_matrix     112896 dgCMatrix S4

The function metricML() returns a list of

  • the embedding coordinates embedding of
  • the embedding metric rmetric for each point \(p \in x\) as an array
  • the weighted neighborhood graph as an igraph object weighted_graph
  • the sparse adjacency matrix from the graph adj_matrix

Embedding plots

# Comparison of Isomap embedding plot
par(mfrow = c(1, 2))
plot(metric_isomap$embedding, col = viridis::viridis(24)) # metricML
# plot(e@data@data, col = viridis::viridis(24)) # embedding from dimRed, same as metricML()
emb_isomap <-
  feather::read_feather(here::here("data/embedding_isomap_1id336tow.feather"))
plot(emb_isomap$`0`, emb_isomap$`1`, col = viridis::viridis(24)) # embedding from megaman, radius = 0.4

Riemannian metric

# Use adjacency matrix as input for metricML and dimRed, then the embedding should be close, as well as the Riemannian metric
metric_isomap$adj_matrix[1:10, 1:10]# renormalized adjacency matrix
## 10 x 10 sparse Matrix of class "dgCMatrix"
##                                                                              
##  [1,] 1.0000000 0.25392915 0.1208264 .         .         .          .        
##  [2,] 0.2539292 1.00000000 0.3342501 0.1775332 0.1076027 0.09006124 .        
##  [3,] 0.1208264 0.33425008 1.0000000 0.3861779 0.2245130 0.16488133 0.1203864
##  [4,] .         0.17753315 0.3861779 1.0000000 0.4971605 0.30586322 0.2255448
##  [5,] .         0.10760270 0.2245130 0.4971605 1.0000000 0.40429943 0.3473452
##  [6,] .         0.09006124 0.1648813 0.3058632 0.4042994 1.00000000 0.4308813
##  [7,] .         .          0.1203864 0.2255448 0.3473452 0.43088129 1.0000000
##  [8,] .         .          0.1020702 0.1884481 0.2616027 0.31973425 0.4748913
##  [9,] .         0.09017866 0.1700980 0.3170741 0.4422321 0.61829795 0.4195736
## [10,] .         0.09494747 0.1670566 0.3008233 0.4007255 0.32522372 0.2918485
##                                      
##  [1,] .         .          .         
##  [2,] .         0.09017866 0.09494747
##  [3,] 0.1020702 0.17009799 0.16705665
##  [4,] 0.1884481 0.31707414 0.30082327
##  [5,] 0.2616027 0.44223208 0.40072551
##  [6,] 0.3197343 0.61829795 0.32522372
##  [7,] 0.4748913 0.41957363 0.29184850
##  [8,] 1.0000000 0.29214049 0.22131497
##  [9,] 0.2921405 1.00000000 0.38466228
## [10,] 0.2213150 0.38466228 1.00000000
metric_isomap$weighted_graph %>% is.connected()
## [1] TRUE
riem_isomap <- metric_isomap$rmetric
riem_isomap[,,1] %>% 
  # isSymmetric()
  matrixcalc::is.positive.definite() # FALSE
## [1] TRUE

The Riemannian metric from the metricML() function is now positive definite for all points.

Now compare it with the megaman output.

np = import("numpy")
H_isomap <- np$load(here::here("data/hmatrix_isomap_1id336tow.npy"))
H_isomap[1,,, drop=TRUE] %>% 
  matrixcalc::is.positive.definite() # TRUE
## [1] TRUE
pd <- rep(NA, N)
for (i in 1:N) {
  pd[i] <- 
    riem_isomap[,,i] %>%  # R
    # H_isomap[i,,, drop=TRUE] %>% # Python
      # isSymmetric()
      matrixcalc::is.positive.definite() # FALSE
}
pd %>% sum
## [1] 336
# determinant(riem_isomap[,,i])
# mat <- riem_isomap[,,i]
# eigen(mat)

Debug: riemmanien metric not positive definite (DONE)

Question: footnote on P15 of the paper, how to decide the constant c?

In the case of heat kernel (1), c = 1/4, which - crucially - is independent of the dimension of M.

# Function for graph Laplacian
# Input: W: N*N weight matrix for the neighborhood graph, radius: bandwidth parameter
# Output: L: N*N graph Laplacian matrix
Laplacian <- function(W, radius, lambda = 1){
  
  D <- Matrix::Diagonal(x = rowSums(W)^(-lambda)) # inverse of a diagonal matrix
  W1 <- D %*% W %*% D
  D1 <- Matrix::Diagonal(x = 1 / rowSums(W1)) # inverse of Tn1
  L <- 1 / (radius^2 / 4) * (D1 %*% W1 - Matrix::Diagonal(nrow(W))) # c=1/4 for heat kernel, depending on the choice of weights, P15 footnote
  
  return(L)
}
# Function for Riemannian metric for each point
# The Riemannian metric and its dual associated with an embedding Y. 
# The intrinsic dimension d
# The Riemannian metric is currently denoted by G, its dual metric by H, and the Laplacian by L. 
# G at each point is the matrix inverse of H.
riemann_metric <- function(Y, laplacian, d){
  
  # TODO: add dimension check for all inputs
  
  H <- array(NA, dim = c(d, d, nrow(Y)))
  
  for (i in 1:d) {
    for (j in i:d) {
      yij <- Y[, i] * Y[, j]
      H[i, j, ] <- as.vector(0.5 * (laplacian %*% yij - Y[, i] * (laplacian %*% Y[, j]) - Y[, j] * (laplacian %*% Y[, i])))
      H[j, i, ] <- H[i, j, ] # symmetric matrix
    }
  }
  
  ## Pseudo inverse of H gives the final embedding metric h
  ## Array H corresponds to \tilde{H} in Step 4(a)
  ## The embedding metric H is the pseudo inverse of \tilde{H}
  for (i in 1:nrow(Y)) {
    Hsvals <- eigen(H[ , ,i])
    Huu <- Hsvals$vectors
    Hvv <- Hsvals$values[1:d] # top d largest eigenvalues, already sorted in decreasing order
    Hvv1 <- diag(x = 1 / Hvv)
    H[ , ,i] <- Huu %*% Hvv1 %*% t(Huu)
    H[, , i] <- 0.5 * (H[, , i] + t(H[, , i])) # fix not symmetric issue
  }

  return(H)
}

Variable kernel density estimate

For multivariate data, the variable kernel density estimate is given by \[\hat{f}(x) = n^{-1} \sum_i K_H (x - X_i).\]

# 2-d kde
mkde <- function (x, h) {
  # Data is a matrix of n*d
  # H is an array of dimension (d,d,n)
  start <- Sys.time()
  
  n <- nrow(x)
  if (dim(x)[2] < 2)
    stop("Data matrix has only one variable.")
  if (any(!is.finite(x))) 
    stop("Missing or infinite values in the data are not allowed! ")
  if (!all.equal(nrow(h), ncol(h), dim(x)[2]))
    stop("The bandwidth matrix is of wrong dimension. ")
  
  s <- rep(0, n)
  y <- rep(0, n)
  for (i in 1:n) {
    for (j in 1:n){
      s[i] <- s[i] + abs(det(h[,,i]))^(-1/2) * exp(- 1/2 * t(x[i,] - x[j,]) %*% solve(h[,,i]) %*% (x[i,] - x[j,]))
    }
    y[i] <- s[i] / (n * 2 * pi)
  }
  print(Sys.time() - start) 

  return(y)
}

Test the variable KDE function with megaman output

If we use the embedding x and the Riemannian metric h from megaman, we could produce contour plots.

x <- cbind(emb_isomap$`0`, emb_isomap$`1`)
pyriem_isomap <- array(NA, dim = c(s, s, N))
for(i in 1:N){
  pyriem_isomap[,,i] <- H_isomap[i,,]
}
f <- mkde(x = x, h = pyriem_isomap)
## Time difference of 5.880567 secs
summary(f)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1393  0.4059  0.8692  0.8075  1.1473  1.5196
# Top 20 index with the lowest densities
head(order(f), n=20)
##  [1] 326  38 134 182 324 135 327 325  37 133  39 278 181 328  86 277 230 144  85
## [20] 183

Contour plot for megaman

embed_den <- as_tibble(cbind(x = x[,1], y = x[,2], z = f)) %>%
  drop_na()
# library(akima)
pixel <- 100
lenbreak <- 5
akima.spl <- interp(embed_den$x, embed_den$y, embed_den$z, nx=pixel, ny=pixel, linear=FALSE)
p.zmin <- min(embed_den$z,na.rm=TRUE)
p.zmax <- max(embed_den$z,na.rm=TRUE)
breaks <- pretty(c(p.zmin,p.zmax), lenbreak)
# hdrcde
hdrscatterplot(embed_den$x, embed_den$y, noutliers = 10)

hdrinfo <- hdr.2d(embed_den$x, embed_den$y, prob = c(50, 95, 99))
plot.hdr2d(hdrinfo)
# Contour
contour(akima.spl, main = "smooth interp(*, linear = FALSE)", col = viridis(length(breaks)-1), levels=breaks, add=TRUE)
points(embed_den, pch = 20)

filled.contour(akima.spl, color.palette = viridis,
               plot.axes = { axis(1); axis(2);
                 title("smooth interp(*, linear = FALSE)");
                 points(embed_den, pch = 3, col= hcl(c=20, l = 10))})

Contour plot after interpolation for R metricML()

Now apply the mkde() function to the output of metricML() written in R.

x <- metric_isomap$embedding
riem_isomap <- metric_isomap$rmetric
f <- mkde(x = x, h = riem_isomap)
## Time difference of 5.768383 secs
summary(f)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0009714 0.0019742 0.0023836 0.0026091 0.0032485 0.0053262
head(order(f), n=20)
##  [1] 298 297 145 300 296 250 249  98 198 152  55  56 251 287 299 146 199 200 193
## [20] 335

We can check the outliers based on density f. In R, the outliers are different from megaman output.